The common flu, although different from year to year often exhibits common patterns from year to year. We use CMU’s Delphi group’s API to collect data from 2000 to 2018 flu seasons in the United States. We will specifically focus on a weekly “wILI” value, which captures a weighted percent of influenza-like illness cases seen in the hospital in that given week.
CMU’s Delphi group’s API can be sourced by running the following command.
source(paste0("https://raw.githubusercontent.com/",
"cmu-delphi/delphi-epidata/main/src/client/delphi_epidata.R"))
This file provides a lists of functions in a more pythonic way - all functions are avaiable in the Epidata list, and can be accessed through the notation Epidata$function_name.
We aim to examine flu seasons from 2001 to 2018. Although the API can pull data from as far back as 2001. Note that influenza death estimation (which we are not looking at), change in 20101. Additionally, flu seasons often are examined October of the previous year to August of the year.
flu_data_raw <- Epidata$fluview(list('nat'), Epidata$range(200010,
201809))
Delphi has documentation on the outcome of the fluview function (and other functions), on their website. Regions can be found here and is related to this.
We can extract the desired information from the second item in the list (under the epidata name), into a data.frame with the following function extract_df. We’ve folded the code as it’s messy and not that informative.
extract_df code:
#' extract data frame from delphi's epidata "fluview" function
#'
#' @param epidata "epidata" output as desired by the fluview function (see https://cmu-delphi.github.io/delphi-epidata/api/fluview.html)
#'
#' returns data.frame with the same information
extract_df <- function(epidata){
# basic structure of data frame -----------------
nr <- length(epidata)
nc <- length(epidata[[1]])
colnames <- names(epidata[[1]])
# creating an empty data frame ------------------
## character and numeric columns
c_idx <- which(sapply(epidata[[1]], class) %in% c("character", "factor"))
n_idx <- (1:nc)[!(1:nc %in% c_idx)]
data <- data.frame(matrix(nrow = nr, ncol = 0))
for (col_idx in 1:nc){
if (col_idx %in% c_idx){
data <- cbind(data, data.frame(matrix("c", nrow = nr, ncol = 1)))
} else {
data <- cbind(data, data.frame(matrix(0, nrow = nr, ncol = 1)))
}
}
names(data) <- colnames
for (row_idx in 1:nr){
inner <- epidata[[row_idx]]
inner[sapply(inner, function(x) length(x) == 0L)] <- NA
# breaking into parts
data[row_idx,c_idx] <- unlist(inner[c_idx])
data[row_idx,n_idx] <- unlist(inner[n_idx])
}
return(data)
}
We create a data.frame for the flu data (percent of new cases) and clean it up a bit to emphasis the week of the year we’re looking at.
Below is an example of the data.
The set of code below is messing but let’s talk about a few things.
df <- flu_data %>% tibble::rownames_to_column()
#df
# usa population (by month) from https://fred.stlouisfed.org/series/POPTHM
pop <- read.csv("POPTHM.csv") %>%
mutate(MONTH = lubridate::month(DATE),
YEAR = lubridate::year(DATE))
df <- df %>% mutate(epiweek_month = lubridate::month(epiweek_date))
df_plus <- df %>% mutate(epiweek_year = as.numeric(epiweek_year)) %>%
left_join(pop, by = c("epiweek_month" = "MONTH", "epiweek_year" = "YEAR")) %>%
mutate(scaled_num_ili = POPTHM * wili/100)
# needs to be updated to do tidyverse also what is par?
# and what is confirmed (cumulated relative to what?)
# negative values for some output...
devtools::load_all(path = "../")
#?EpiCompare::cases_to_SIR
# change to days (to get 3 day flu)
# expand?
df2 <- df_plus %>% mutate(n_wili = round(POPTHM * wili / 100),
rowname = as.numeric(rowname)*7,
d_wili = wili/7) %>%
select(d_wili, epiweek_date)
stepwise_fill_in <- function(df2, step = 7){
# fill in with zeros
df_new <- data.frame(df2[1,])
for (r_idx in 1:nrow(df2)){
inner_df <- data.frame(d_wili = df2$d_wili[r_idx],
epiweek_date = df2$epiweek_date[r_idx] + 0:(step-1))
df_new <- rbind(df_new, inner_df)
}
df_new <- df_new[-1,]
return(df_new)
}
df3 <- stepwise_fill_in(df2)
df3 %>% head()
## d_wili epiweek_date
## 2 0.1811801 2000-03-06
## 3 0.1811801 2000-03-07
## 4 0.1811801 2000-03-08
## 5 0.1811801 2000-03-09
## 6 0.1811801 2000-03-10
## 7 0.1811801 2000-03-11
df4 <- df3 %>% mutate(month = lubridate::month(epiweek_date),
year = lubridate::year(epiweek_date)) %>%
left_join(pop, by = c("month" = "MONTH", "year" = "YEAR")) %>%
mutate(n_wili = round(POPTHM * d_wili / 100))
new <- df4 %>% tibble::rownames_to_column() %>%
rename(t = rowname, N = POPTHM, current = n_wili) %>%
mutate(confirmed = cumsum(current)) %>%
EpiCompare::cases_to_SIR(data = ., par = c("gamma" = 1/3)) %>% #unit relative to days (3 days)
select(t, X0, X1, X2)
new %>% mutate(t = as.numeric(t)) %>%
pivot_longer(cols = c(X0, X1, X2)) %>%
filter(name == "X1") %>%
ggplot2::ggplot() + ggplot2::geom_line(ggplot2::aes(x = t, y = value, color = name)) +
ggplot2::theme_minimal()
new2 <- new %>%
mutate(N = X0 + X1 + X2,
confirmed = X2)
new2_R_to_S <- new2 %>% select(t, confirmed, N) %>%
cases_to_SIR(par = c("gamma" = 1/60)) # 3 months to be suspectible again (to the flu)
new2_R_to_S %>% head
## t confirmed N X0 X1 X2
## 1 1 0 281531 281531 0 0
## 2 2 179 281531 281352 179 0
## 3 3 442 281531 281089 442 0
## 4 4 813 281531 280718 804 9
## 5 5 1194 281531 280337 1165 29
## 6 6 1646 281531 279885 1600 46
new$X3 <- new2_R_to_S$X2
new_SIRS <- new
new_SIRS <- new_SIRS %>% mutate(
X0 = X0 + X3) %>%
dplyr::select(t, X0, X1, X2) %>%
dplyr::rename(S = X0, I = X1, R = X2)
new_SIRS %>%
ggplot(aes(x = S, y = I, z = R)) +
geom_path() +
coord_tern() +
theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
new_SIRS %>% pivot_longer(cols = c(S, I, R)) %>%
ggplot() + geom_line(aes(x = as.numeric(t),
y = value, group = name, color = name)) +
theme_minimal()
new %>% pivot_longer(cols = c(X0, X1, X2)) %>%
filter(name == "X1") %>%
ggplot() + geom_line(aes(x = as.numeric(t),
y = value, group = name, color = name)) +
theme_minimal()
# change to days (to get 3 day flu)
# expand?
df2 <- df_plus %>% mutate(n_wili = round(POPTHM * wili / 100),
rowname = as.numeric(rowname)*7,
d_wili = wili/7) %>%
select(d_wili, epiweek_date, season_year)
stepwise_fill_in <- function(df2, step = 7){
# fill in with zeros
df_new <- data.frame(df2[1,])
for (r_idx in 1:nrow(df2)){
inner_df <- data.frame(d_wili = df2$d_wili[r_idx],
epiweek_date = df2$epiweek_date[r_idx] + 0:(step-1),
season_year = df2$season_year[r_idx])
df_new <- rbind(df_new, inner_df)
}
df_new <- df_new[-1,]
return(df_new)
}
df3 <- df2 %>% group_by(season_year) %>% stepwise_fill_in()
df4 <- df3 %>% mutate(month = lubridate::month(epiweek_date),
year = lubridate::year(epiweek_date)) %>%
left_join(pop, by = c("month" = "MONTH", "year" = "YEAR")) %>%
mutate(n_wili = round(POPTHM * d_wili / 100))
new <- df4 %>% tibble::rownames_to_column() %>%
rename(t = rowname, N = POPTHM, current = n_wili) %>%
group_by(season_year) %>%
mutate(confirmed = cumsum(current)) %>%
EpiCompare::cases_to_SIR(data = ., par = c("gamma" = 1/3)) %>% #unit relative to days (3 days)
select(t, X0, X1, X2)
## Adding missing grouping variables: `season_year`
new_other <- df4 %>% tibble::rownames_to_column() %>%
rename(t = rowname, N = POPTHM, current = n_wili) %>%
mutate(confirmed = cumsum(current)) %>%
EpiCompare::cases_to_SIR(data = ., par = c("gamma" = 1/3)) %>% #unit relative to days (3 days)
select(t, X0, X1, X2)
new %>% mutate(t = as.numeric(t)) %>%
pivot_longer(cols = c(X0, X1, X2)) %>%
filter(name == "X1") %>%
ggplot2::ggplot() + ggplot2::geom_line(ggplot2::aes(x = t, y = value, color = name)) +
ggplot2::theme_minimal()
new_other %>% mutate(t = as.numeric(t)) %>%
pivot_longer(cols = c(X0, X1, X2)) %>%
filter(name == "X1") %>%
ggplot2::ggplot() + ggplot2::geom_line(ggplot2::aes(x = t, y = value, color = name)) +
ggplot2::theme_minimal()
new2 <- new %>%
mutate(N = X0 + X1 + X2,
confirmed = X2)
new2_R_to_S <- new2 %>% select(t, confirmed, N) %>%
cases_to_SIR(par = c("gamma" = 1/60)) # 3 months to be suspectible again (to the flu)
## Adding missing grouping variables: `season_year`
new2_R_to_S %>% head
## # A tibble: 6 x 7
## # Groups: season_year [1]
## season_year t confirmed N X0 X1 X2
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1999 1 0 281531 281531 0 0
## 2 1999 2 158 281531 281373 158 0
## 3 1999 3 449 281531 281082 445 4
## 4 1999 4 769 281531 280762 762 7
## 5 1999 5 1182 281531 280349 1162 20
## 6 1999 6 1673 281531 279858 1635 38
new$X3 <- new2_R_to_S$X2
new_SIRS <- new
new_SIRS <- new_SIRS %>% mutate(
X0 = X0 + X3) %>%
dplyr::select(t, X0, X1, X2) %>%
dplyr::rename(S = X0, I = X1, R = X2)
## Adding missing grouping variables: `season_year`
df_lims = data.frame(S = c(1,.4,.4), I = c(0,.6,0), R = c(0,0,.6))
new_SIRS %>%
ggplot(aes(x = S, y = I, z = R,
color = factor(season_year))) +
geom_path() +
coord_tern() +
theme_minimal() +
tern_limits(T=max(df_lims$I),
L=max(df_lims$S),
R=max(df_lims$R))
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
plotly::ggplotly(new_SIRS %>% pivot_longer(cols = c(S, I, R)) %>%
ggplot() + geom_line(aes(x = as.numeric(t),
y = value, group = name, color = name)) +
theme_minimal() )
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
new %>% pivot_longer(cols = c(X0, X1, X2)) %>%
filter(name == "X1") %>%
ggplot() + geom_line(aes(x = as.numeric(t),
y = value, group = name, color = name)) +
theme_minimal()
new_other %>% pivot_longer(cols = c(X0, X1, X2)) %>%
filter(name == "X1") %>%
ggplot() + geom_line(aes(x = as.numeric(t), y = value, group = name, color = name)) +
theme_minimal()
how to connect wILI to the total US (https://www.cdc.gov/flu/about/burden/how-cdc-estimates.htm)